Sys.setlocale('LC_ALL', 'C')
[1] "LC_CTYPE=C;LC_NUMERIC=C;LC_TIME=C;LC_COLLATE=C;LC_MONETARY=C;LC_MESSAGES=en_US.UTF-8;LC_PAPER=vi_VN;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=vi_VN;LC_IDENTIFICATION=C"
library(fpp2)
library(gam)
library(prophet)
library (gbm)
library(viridis)
Loading required package: viridisLite
setwd('/home/hieu/BDMA/TSA')
?arrivals
autoplot(arrivals)
autoplot(arrivals[,c(1,2)])
Japan<- arrivals[,1]
NZ<- arrivals[,2]
UK<- arrivals[,3]
US<- arrivals[,4]
tt<- (1:length(NZ))
seas <- factor(c(rep(1:4,length(NZ)/4),1:3))
#####1:3 because there are three observations 'out'
mod2 <- lm(NZ~ tt+seas)
summary(mod2)
AIC(mod2)
Linear model with no seasonality
mod2a <- lm(NZ~ tt)
summary(mod2a)
we now try with a GAM
# Values for df should be greater than 1, with df=1 implying a linear fit. Default is df=4
g1 <- gam(NZ~s(tt)+seas)
summary(g1)
Call: gam(formula = NZ ~ s(tt) + seas)
Deviance Residuals:
Min 1Q Median 3Q Max
-38.576 -11.971 -1.408 11.954 50.091
(Dispersion Parameter for gaussian family taken to be 307.5263)
Null Deviance: 903510.7 on 126 degrees of freedom
Residual Deviance: 36595.57 on 118.9998 degrees of freedom
AIC: 1097.675
Number of Local Scoring Iterations: NA
Anova for Parametric Effects
Df Sum Sq Mean Sq F value Pr(>F)
s(tt) 1 783533 783533 2547.856 < 2.2e-16 ***
seas 3 70207 23402 76.098 < 2.2e-16 ***
Residuals 119 36596 308
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova for Nonparametric Effects
Npar Df Npar F Pr(F)
(Intercept)
s(tt) 3 15.121 2.121e-08 ***
seas
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,2))
plot(g1, se=T)
AIC(g1)
[1] 1097.675
# time has a nonlinear effect
g1a <- gam(NZ~s(tt))
par(mfrow=c(1,2))
plot(g1a, se=T)
summary(g1a)
Call: gam(formula = NZ ~ s(tt))
Deviance Residuals:
Min 1Q Median 3Q Max
-74.573 -18.713 4.153 20.611 76.733
(Dispersion Parameter for gaussian family taken to be 876.4429)
Null Deviance: 903510.7 on 126 degrees of freedom
Residual Deviance: 106925.9 on 121.9998 degrees of freedom
AIC: 1227.845
Number of Local Scoring Iterations: NA
Anova for Parametric Effects
Df Sum Sq Mean Sq F value Pr(>F)
s(tt) 1 783533 783533 893.99 < 2.2e-16 ***
Residuals 122 106926 876
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova for Nonparametric Effects
Npar Df Npar F Pr(F)
(Intercept)
s(tt) 3 4.9637 0.002762 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# linear model may be also performed with library(gam)
g0 <- gam(NZ~(tt)+seas)
summary(g0)
Call: gam(formula = NZ ~ (tt) + seas)
Deviance Residuals:
Min 1Q Median 3Q Max
-37.777 -15.263 -2.644 12.825 53.704
(Dispersion Parameter for gaussian family taken to be 414.3167)
Null Deviance: 903510.7 on 126 degrees of freedom
Residual Deviance: 50546.63 on 122 degrees of freedom
AIC: 1132.691
Number of Local Scoring Iterations: 2
Anova for Parametric Effects
Df Sum Sq Mean Sq F value Pr(>F)
tt 1 783533 783533 1891.14 < 2.2e-16 ***
seas 3 69431 23144 55.86 < 2.2e-16 ***
Residuals 122 50547 414
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,2))
plot(g0, se=T)
AIC(g0)
[1] 1132.691
GAM with splines performs better
# try another option with loess (lo)
g2<- gam(NZ~lo(tt)+seas)
summary(g2)
Call: gam(formula = NZ ~ lo(tt) + seas)
Deviance Residuals:
Min 1Q Median 3Q Max
-39.3138 -12.3064 -0.8082 11.4180 50.4500
(Dispersion Parameter for gaussian family taken to be 329.8333)
Null Deviance: 903510.7 on 126 degrees of freedom
Residual Deviance: 39463.9 on 119.648 degrees of freedom
AIC: 1105.962
Number of Local Scoring Iterations: NA
Anova for Parametric Effects
Df Sum Sq Mean Sq F value Pr(>F)
lo(tt) 1.00 783533 783533 2375.541 < 2.2e-16 ***
seas 3.00 69943 23314 70.686 < 2.2e-16 ***
Residuals 119.65 39464 330
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova for Nonparametric Effects
Npar Df Npar F Pr(F)
(Intercept)
lo(tt) 2.4 14.286 6.383e-07 ***
seas
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,2))
plot(g2, se=T)
AIC(g2)
[1] 1105.962
# perform analysis of residuals
tsdisplay(residuals(g1))
aar1<- auto.arima(residuals(g1))
plot(as.numeric(NZ), type="l")
lines(fitted(g1), col=3)
lines(fitted(aar1)+ fitted(g1), col=4)
# Exercise: try also with the other variables
iPhone sales
# data
iphone=read.csv("iphone.csv", sep=";")
str(iphone)
'data.frame': 46 obs. of 3 variables:
$ iphone : num 0.27 1.12 2.32 1.7 0.72 6.89 4.36 3.79 5.21 7.37 ...
$ quarter: chr "01/10/2007" "31/12/2007" "31/03/2008" "01/07/2008" ...
$ cap : int 100 100 100 100 100 100 100 100 100 100 ...
# some preliminary analysis on the data
iphone$quarter=as.Date(iphone$quarter, format="%d/%m/%Y")
str(iphone)
'data.frame': 46 obs. of 3 variables:
$ iphone : num 0.27 1.12 2.32 1.7 0.72 6.89 4.36 3.79 5.21 7.37 ...
$ quarter: Date, format: "2007-10-01" "2007-12-31" "2008-03-31" ...
$ cap : int 100 100 100 100 100 100 100 100 100 100 ...
colnames(iphone)=c("y","ds","cap")
summary(iphone$y) ##to explain why cap is 100
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.270 8.742 35.130 31.916 47.370 78.290
#plot
plot(iphone$y, type="l", x=iphone$ds, ylab="", xlab="Year")
# nonlinear trend
# seasonality
# Prophet model
# model with no seasonality
# m2=prophet(iphone, yearly.seasonality=F, daily.seasonality=F, weekly.seasonality = F, growth="logistic", n.changepoints=15)
# model with automatic selection of seasonality (default is "auto")
# in this case just yearly seasonality, if daily and weekly seasonsonality are needed we need to specify
# changepoints not specified, automatically set
m2=prophet(iphone, growth="logistic")
Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# n.changepoints default number is 25
summary(m2)
Length Class Mode
growth 1 -none- character
changepoints 25 POSIXct numeric
n.changepoints 1 -none- numeric
changepoint.range 1 -none- numeric
yearly.seasonality 1 -none- character
weekly.seasonality 1 -none- character
daily.seasonality 1 -none- character
holidays 0 -none- NULL
seasonality.mode 1 -none- character
seasonality.prior.scale 1 -none- numeric
changepoint.prior.scale 1 -none- numeric
holidays.prior.scale 1 -none- numeric
mcmc.samples 1 -none- numeric
interval.width 1 -none- numeric
uncertainty.samples 1 -none- numeric
specified.changepoints 1 -none- logical
start 1 POSIXct numeric
y.scale 1 -none- numeric
logistic.floor 1 -none- logical
t.scale 1 -none- numeric
changepoints.t 25 -none- numeric
seasonalities 1 -none- list
extra_regressors 0 -none- list
country_holidays 0 -none- NULL
stan.fit 4 -none- list
params 6 -none- list
history 7 data.frame list
history.dates 46 POSIXct numeric
train.holiday.names 0 -none- NULL
train.component.cols 3 data.frame list
component.modes 2 -none- list
fit.kwargs 0 -none- list
# create a future 'window' for prediction
future2 <- make_future_dataframe(m2, periods = 20, freq="quarter", include_history = T)
tail(future2)
future2$cap=100
forecast2 <- predict(m2, future2)
tail(forecast2[c("ds", "yhat", "yhat_lower", "yhat_upper")])
# prediction plot
plot(m2, forecast2)
# Dynamic plot
dyplot.prophet(m2, forecast2)
# plot with change points
plot(m2, forecast2)+add_changepoints_to_plot(m2, threshold=0)
# dates corresponding to change points
m2$changepoints
[1] 0
# Prophet with no change points and multiplicative seasonality
m2=prophet(iphone, growth="logistic", n.changepoints=0, seasonality.mode='multiplicative')
Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# summary(m2)
# m2$seasonalities
future2 <- make_future_dataframe(m2, periods = 20, freq="quarter", include_history = T)
tail(future2)
future2$cap=100
forecast2 <- predict(m2, future2)
tail(forecast2[c("ds", "yhat", "yhat_lower", "yhat_upper")])
# prediction plot
plot(m2, forecast2)
dyplot.prophet(m2, forecast2)
‘Movies’ Dataset
data <- read.csv("movies.csv", stringsAsFactors=TRUE)
str(data)
'data.frame': 3229 obs. of 27 variables:
$ X.1 : int 1 2 3 4 5 6 7 8 9 10 ...
$ X : int 1 2 3 4 5 6 7 8 9 10 ...
$ budget : int 237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
$ popularity : num 1.50e+08 1.39e+08 1.07e+08 1.12e+07 4.39e+07 ...
$ production_countries: int 1 1 1 1 1 1 1 1 1 1 ...
$ release_date : Factor w/ 2494 levels "01/01/1961","01/01/1969",..: 785 1552 2190 1304 470 40 2005 1792 499 1874 ...
$ revenue : num 2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
$ runtime : int 162 169 148 165 132 139 100 141 153 151 ...
$ spoken_languages : int 1 1 1 1 1 1 1 1 1 1 ...
$ vote_average : num 7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
$ vote_classes : Factor w/ 5 levels "0 a 5","5 a 6",..: 4 3 3 4 3 2 4 4 4 2 ...
$ Comp_Universal : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Paramount : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Warner : int 0 0 0 1 0 0 0 0 1 1 ...
$ Comp_20fox : int 1 0 0 0 0 0 0 0 0 0 ...
$ Comp_Columbia : int 0 0 1 0 0 1 0 0 0 0 ...
$ Comp_NewLine : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Disney : int 0 1 0 0 1 0 1 0 0 0 ...
$ Comp_VillageRoadshow: int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Miramax : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Pixar : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_RelMedia : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_MGM : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_DreamWorks : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Touchstone : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_CanalPlus : int 0 0 0 0 0 0 0 0 0 0 ...
$ genere : Factor w/ 19 levels "Action","Adventure",..: 9 2 1 1 1 1 3 1 9 1 ...
# erase columns of indicator variables
data<-data[,-c(1,2)]
# transform variable release_date in format "data"
data$release_date <- as.Date(data$release_date, "%d/%m/%Y")
str(data)
'data.frame': 3229 obs. of 25 variables:
$ budget : int 237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
$ popularity : num 1.50e+08 1.39e+08 1.07e+08 1.12e+07 4.39e+07 ...
$ production_countries: int 1 1 1 1 1 1 1 1 1 1 ...
$ release_date : Date, format: "2009-12-10" "2007-05-19" "2015-10-26" ...
$ revenue : num 2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
$ runtime : int 162 169 148 165 132 139 100 141 153 151 ...
$ spoken_languages : int 1 1 1 1 1 1 1 1 1 1 ...
$ vote_average : num 7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
$ vote_classes : Factor w/ 5 levels "0 a 5","5 a 6",..: 4 3 3 4 3 2 4 4 4 2 ...
$ Comp_Universal : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Paramount : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Warner : int 0 0 0 1 0 0 0 0 1 1 ...
$ Comp_20fox : int 1 0 0 0 0 0 0 0 0 0 ...
$ Comp_Columbia : int 0 0 1 0 0 1 0 0 0 0 ...
$ Comp_NewLine : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Disney : int 0 1 0 0 1 0 1 0 0 0 ...
$ Comp_VillageRoadshow: int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Miramax : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Pixar : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_RelMedia : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_MGM : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_DreamWorks : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_Touchstone : int 0 0 0 0 0 0 0 0 0 0 ...
$ Comp_CanalPlus : int 0 0 0 0 0 0 0 0 0 0 ...
$ genere : Factor w/ 19 levels "Action","Adventure",..: 9 2 1 1 1 1 3 1 9 1 ...
# Response variable
summary(data$vote_average)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 5.800 6.300 6.309 6.900 8.500
#boxplot(data$vote_average, col="orange", ylim=c(0,10), main="Movies", ylab="Rating")
hist(data$vote_average, col="orange", main="Movies", xlab="Rating")
# explanatory variables
summary(data)
budget popularity production_countries release_date revenue
Min. : 1 Min. : 0 Min. :0.0000 Min. :1916-09-04 Min. :5.000e+00
1st Qu.: 10500000 1st Qu.: 7447714 1st Qu.:1.0000 1st Qu.:1998-09-11 1st Qu.:1.700e+07
Median : 25000000 Median : 18256028 Median :1.0000 Median :2005-07-20 Median :5.518e+07
Mean : 40654445 Mean : 26572600 Mean :0.9006 Mean :2002-03-21 Mean :1.212e+08
3rd Qu.: 55000000 3rd Qu.: 35307577 3rd Qu.:1.0000 3rd Qu.:2010-11-12 3rd Qu.:1.463e+08
Max. :380000000 Max. :875581305 Max. :1.0000 Max. :2016-09-09 Max. :2.788e+09
runtime spoken_languages vote_average vote_classes Comp_Universal Comp_Paramount
Min. : 41.0 Min. :0.0000 Min. :0.000 0 a 5 : 192 Min. :0.00000 Min. :0.00000
1st Qu.: 96.0 1st Qu.:1.0000 1st Qu.:5.800 5 a 6 : 843 1st Qu.:0.00000 1st Qu.:0.00000
Median :107.0 Median :1.0000 Median :6.300 6 a 7 :1426 Median :0.00000 Median :0.00000
Mean :110.7 Mean :0.9709 Mean :6.309 7 a 8 : 706 Mean :0.08826 Mean :0.08083
3rd Qu.:121.0 3rd Qu.:1.0000 3rd Qu.:6.900 8 a 10: 62 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :338.0 Max. :1.0000 Max. :8.500 Max. :1.00000 Max. :1.00000
Comp_Warner Comp_20fox Comp_Columbia Comp_NewLine Comp_Disney
Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
Mean :0.09291 Mean :0.06473 Mean :0.08176 Mean :0.04429 Mean :0.03685
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
Comp_VillageRoadshow Comp_Miramax Comp_Pixar Comp_RelMedia Comp_MGM
Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.0000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000
Median :0.00000 Median :0.00000 Median :0.000000 Median :0.00000 Median :0.0000
Mean :0.02261 Mean :0.02323 Mean :0.004955 Mean :0.03097 Mean :0.0288
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.0000
Max. :1.00000 Max. :1.00000 Max. :1.000000 Max. :1.00000 Max. :1.0000
Comp_DreamWorks Comp_Touchstone Comp_CanalPlus genere
Min. :0.00000 Min. :0.0000 Min. :0.00000 Action :557
1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 Drama :556
Median :0.00000 Median :0.0000 Median :0.00000 Comedy :487
Mean :0.03066 Mean :0.0288 Mean :0.03345 Thriller :253
3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 Horror :195
Max. :1.00000 Max. :1.0000 Max. :1.00000 Animation:187
(Other) :994
summary(data[,c(1,2,5,6)])
budget popularity revenue runtime
Min. : 1 Min. : 0 Min. :5.000e+00 Min. : 41.0
1st Qu.: 10500000 1st Qu.: 7447714 1st Qu.:1.700e+07 1st Qu.: 96.0
Median : 25000000 Median : 18256028 Median :5.518e+07 Median :107.0
Mean : 40654445 Mean : 26572600 Mean :1.212e+08 Mean :110.7
3rd Qu.: 55000000 3rd Qu.: 35307577 3rd Qu.:1.463e+08 3rd Qu.:121.0
Max. :380000000 Max. :875581305 Max. :2.788e+09 Max. :338.0
par(mfrow=c(2,2))
for(i in c(1,2,5,6)){
hist(data[,i], col="orange", main=paste(colnames(data)[i]), xlab="")
}
#transform quantitative variables in log scale
data$budget <- log(data$budget)
data$popularity <- log(data$popularity)
data$revenue <- log(data$revenue)
summary(data[,c(1,2,5,6)])
budget popularity revenue runtime
Min. : 0.00 Min. :-3.913 Min. : 1.609 Min. : 41.0
1st Qu.:16.17 1st Qu.:15.823 1st Qu.:16.649 1st Qu.: 96.0
Median :17.03 Median :16.720 Median :17.826 Median :107.0
Mean :16.80 Mean :16.213 Mean :17.491 Mean :110.7
3rd Qu.:17.82 3rd Qu.:17.380 3rd Qu.:18.801 3rd Qu.:121.0
Max. :19.76 Max. :20.590 Max. :21.749 Max. :338.0
par(mfrow=c(2,2))
for(i in c(1,2,5,6)){
hist(data[,i], col="orange", main=paste(colnames(data)[i]), xlab="")
}
#go back to orginal panel
par(mfrow=c(1,1))
#transform release_date in numeric
data$release_date<-as.numeric(data$release_date)
summary(data$release_date)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-19477 10480 12984 11767 14925 17053
#as.numeric(as.Date("1970-01-01"))
# Set train and test
set.seed(1)
train = sample (1:nrow(data), 0.7*nrow(data))
data.train=data[train ,]
data.test=data[-train ,]
# make some variables factor
data.train[,c(3,7, 10:24)]= lapply(data.train[,c(3,7, 10:24)],factor)
data.test[,c(3,7, 10:24)]= lapply(data.test[,c(3,7, 10:24)],factor)
str(data.train)
'data.frame': 2260 obs. of 25 variables:
$ budget : num 17.6 17.9 16.5 17.7 17.1 ...
$ popularity : num 15.1 16.8 16.7 15 16.6 ...
$ production_countries: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 1 2 ...
$ release_date : num 9486 13138 16912 10172 13719 ...
$ revenue : num 16.4 18.7 16.8 16.2 18.3 ...
$ runtime : int 192 94 94 114 104 106 89 104 93 105 ...
$ spoken_languages : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
$ vote_average : num 7.1 5.7 6 5.9 6.1 5.7 5.5 3.7 4.3 5.4 ...
$ vote_classes : Factor w/ 5 levels "0 a 5","5 a 6",..: 4 2 3 2 3 2 2 1 1 2 ...
$ Comp_Universal : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Comp_Paramount : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Comp_Warner : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 1 1 1 1 ...
$ Comp_20fox : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
$ Comp_Columbia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Comp_NewLine : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 2 1 2 ...
$ Comp_Disney : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Comp_VillageRoadshow: Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
$ Comp_Miramax : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Comp_Pixar : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Comp_RelMedia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Comp_MGM : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Comp_DreamWorks : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Comp_Touchstone : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Comp_CanalPlus : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ genere : Factor w/ 19 levels "Action","Adventure",..: 10 4 1 17 4 9 11 4 4 4 ...
m1 <- lm(vote_average~.-vote_classes, data=data.train)
summary(m1)
# Stepwise Regression
m2 <- step(m1, direction='both')
summary(m2)
#Prediction
p.lm <- predict(m2, newdata=data.test)
dev.lm <- sum((p.lm-data.test$vote_average)^2)
dev.lm
AIC(m2)
# Stepwise GAM
# Start with a linear model (df=1)
g3 <- gam(vote_average~.-vote_classes, data=data.train)
# Show the linear effects
par(mfrow=c(3,5))
plot(g3, se=T)
# Perform stepwise selection using gam scope
# Values for df should be greater than 1, with df=1 implying a linear fit
sc = gam.scope(data.train[,-8], response=8, arg=c("df=2","df=3","df=4"))
g4<- step.Gam(g3, scope=sc, trace=T)
Start: vote_average ~ . - vote_classes; AIC= 4768.761
Step:1 vote_average ~ budget + s(popularity, df = 2) + production_countries + release_date + revenue + runtime + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4669.558
Step:2 vote_average ~ budget + s(popularity, df = 3) + production_countries + release_date + revenue + runtime + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4613.909
Step:3 vote_average ~ s(budget, df = 2) + s(popularity, df = 3) + production_countries + release_date + revenue + runtime + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4568.704
Step:4 vote_average ~ s(budget, df = 2) + s(popularity, df = 3) + production_countries + release_date + revenue + s(runtime, df = 2) + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4524.893
Step:5 vote_average ~ s(budget, df = 2) + s(popularity, df = 4) + production_countries + release_date + revenue + s(runtime, df = 2) + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4496.516
Step:6 vote_average ~ s(budget, df = 3) + s(popularity, df = 4) + production_countries + release_date + revenue + s(runtime, df = 2) + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4484.31
Step:7 vote_average ~ s(budget, df = 3) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + revenue + s(runtime, df = 2) + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4472.868
Step:8 vote_average ~ s(budget, df = 3) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + revenue + s(runtime, df = 3) + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4463.158
Step:9 vote_average ~ s(budget, df = 3) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + s(revenue, df = 2) + s(runtime, df = 3) + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4460.939
Step:10 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + s(revenue, df = 2) + s(runtime, df = 3) + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4458.619
Step:11 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_MGM + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4456.526
Step:12 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Universal + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4454.492
Step:13 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_NewLine + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4452.51
Step:14 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Columbia + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4450.537
Step:15 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Disney + Comp_VillageRoadshow + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4448.61
Step:16 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_RelMedia + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4446.756
Step:17 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 2) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4444.949
Step:18 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 3) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Paramount + Comp_Warner + Comp_20fox + Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4443.306
Step:19 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 3) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Paramount + Comp_Warner + Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4441.657
Step:20 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 3) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Warner + Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_DreamWorks + Comp_Touchstone + Comp_CanalPlus + genere ; AIC= 4440.046
Step:21 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 3) + s(revenue, df = 3) + s(runtime, df = 3) + spoken_languages + Comp_Warner + Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_DreamWorks + Comp_CanalPlus + genere ; AIC= 4438.899
Step:22 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 3) + s(revenue, df = 3) + s(runtime, df = 4) + spoken_languages + Comp_Warner + Comp_Disney + Comp_Miramax + Comp_Pixar + Comp_DreamWorks + Comp_CanalPlus + genere ; AIC= 4437.876
Step:23 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 3) + s(revenue, df = 3) + s(runtime, df = 4) + spoken_languages + Comp_Warner + Comp_Disney + Comp_Pixar + Comp_DreamWorks + Comp_CanalPlus + genere ; AIC= 4436.856
Step:24 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 4) + s(revenue, df = 3) + s(runtime, df = 4) + spoken_languages + Comp_Warner + Comp_Disney + Comp_Pixar + Comp_DreamWorks + Comp_CanalPlus + genere ; AIC= 4436.163
Step:25 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 4) + s(revenue, df = 4) + s(runtime, df = 4) + spoken_languages + Comp_Warner + Comp_Disney + Comp_Pixar + Comp_DreamWorks + Comp_CanalPlus + genere ; AIC= 4435.783
Step:26 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 4) + s(revenue, df = 4) + s(runtime, df = 4) + spoken_languages + Comp_Warner + Comp_Disney + Comp_Pixar + Comp_DreamWorks + genere ; AIC= 4435.674
Step:27 vote_average ~ s(budget, df = 4) + s(popularity, df = 4) + production_countries + s(release_date, df = 4) + s(revenue, df = 4) + s(runtime, df = 4) + spoken_languages + Comp_Warner + Comp_Disney + Comp_DreamWorks + genere ; AIC= 4435.659
summary(g4)
Call: gam(formula = vote_average ~ s(budget, df = 4) + s(popularity,
df = 4) + production_countries + s(release_date, df = 4) +
s(revenue, df = 4) + s(runtime, df = 4) + spoken_languages +
Comp_Warner + Comp_Disney + Comp_DreamWorks + genere, data = data.train,
trace = FALSE)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.23538 -0.35948 0.02259 0.40028 3.51055
(Dispersion Parameter for gaussian family taken to be 0.4085)
Null Deviance: 1696.602 on 2259 degrees of freedom
Residual Deviance: 905.1506 on 2216 degrees of freedom
AIC: 4435.659
Number of Local Scoring Iterations: NA
Anova for Parametric Effects
Df Sum Sq Mean Sq F value Pr(>F)
s(budget, df = 4) 1 61.03 61.028 149.4083 < 2.2e-16 ***
s(popularity, df = 4) 1 147.97 147.968 362.2558 < 2.2e-16 ***
production_countries 1 16.86 16.861 41.2797 1.610e-10 ***
s(release_date, df = 4) 1 64.05 64.054 156.8172 < 2.2e-16 ***
s(revenue, df = 4) 1 38.46 38.458 94.1523 < 2.2e-16 ***
s(runtime, df = 4) 1 189.85 189.849 464.7908 < 2.2e-16 ***
spoken_languages 1 1.47 1.466 3.5898 0.058267 .
Comp_Warner 1 0.75 0.752 1.8402 0.175061
Comp_Disney 1 4.31 4.314 10.5626 0.001171 **
Comp_DreamWorks 1 12.73 12.727 31.1583 2.669e-08 ***
genere 18 115.34 6.408 15.6873 < 2.2e-16 ***
Residuals 2216 905.15 0.408
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova for Nonparametric Effects
Npar Df Npar F Pr(F)
(Intercept)
s(budget, df = 4) 3 35.280 < 2.2e-16 ***
s(popularity, df = 4) 3 71.661 < 2.2e-16 ***
production_countries
s(release_date, df = 4) 3 6.294 0.0002999 ***
s(revenue, df = 4) 3 6.285 0.0003035 ***
s(runtime, df = 4) 3 21.216 1.503e-13 ***
spoken_languages
Comp_Warner
Comp_Disney
Comp_DreamWorks
genere
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(g4)
[1] 4435.659
par(mfrow=c(3,5))
plot(g4, se=T)
# if we want to see better some plot
par(mfrow=c(1,1))
plot(g4, se=T, ask=T)
Make a plot selection (or 0 to exit):
1: plot: s(budget, df = 4) 2: plot: s(popularity, df = 4)
3: plot: production_countries 4: plot: s(release_date, df = 4)
5: plot: s(revenue, df = 4) 6: plot: s(runtime, df = 4)
7: plot: spoken_languages 8: plot: Comp_Warner
9: plot: Comp_Disney 10: plot: Comp_DreamWorks
11: plot: genere 12: plot all terms
13: residuals on 14: rug off
15: se off 16: scale (0)
Boosting-
boost.movies=gbm(vote_average ~ .-vote_classes, data=data.train,
distribution="gaussian", n.trees=5000, interaction.depth=1)
# for the plot
par(mfrow=c(1,1))
# plot of training error
plot(boost.movies$train.error, type="l", ylab="training error")
# always decreasing with increasing number of trees
# relative influence plot
summary(boost.movies)
# let us modify the graphical parameters to obtain a better plot 1re space on the left
# default vector of parameters
mai.old<-par()$mai
mai.old
[1] 1.02 0.82 0.82 0.42
# new vector
mai.new<-mai.old
# new space on the left
mai.new[2] <- 2.5
mai.new
[1] 1.02 2.50 0.82 0.42
# modify graphical parameters
par(mai=mai.new)
summary(boost.movies, las=1)
# las=1 horizontal names on y
summary(boost.movies, las=1, cBar=10)
# cBar defines how many variables
# back to orginal window
par(mai=mai.old)
# test set prediction for every iteration (1:5000)
yhat.boost=predict(boost.movies, newdata=data.test, n.trees=1:5000)
# calculate the error for each iteration
# use 'apply' to perform a 'cycle for'
# the first element is the matrix we want to use, 2 means 'by column',
# and the third element indicates the function we want to calculate
err = apply(yhat.boost, 2, function(pred) mean((data.test$vote_average - pred)^2))
plot(err, type="l")
# error comparison (train e test)
plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
# minimum error in test set
best=which.min(err)
abline(v=best, lty=2, col=4)
min(err) #minimum error
[1] 0.4598964
Boosting - Deeper trees
boost.movies=gbm(vote_average ~ .-vote_classes, data=data.train,
distribution="gaussian", n.trees=5000, interaction.depth=4)
plot(boost.movies$train.error, type="l")
summary(boost.movies, las=1, cBar=10)
yhat.boost=predict(boost.movies ,newdata=data.test,n.trees=1:5000)
err = apply(yhat.boost,2,function(pred) mean((data.test$vote_average-pred)^2))
plot(err, type="l")
plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
best=which.min(err)
abline(v=best, lty=2, col=4)
min(err)
[1] 0.4327901
Boosting - Smaller learning rate
boost.movies=gbm(vote_average ~ .-vote_classes, data=data.train,
distribution="gaussian", n.trees=5000, interaction.depth=1, shrinkage=0.01)
plot(boost.movies$train.error, type="l")
par(mai=mai.new)
summary(boost.movies, las=1, cBar=10)
par(mai=mai.old)
yhat.boost=predict(boost.movies ,newdata=data.test,n.trees=1:5000)
err = apply(yhat.boost,2,function(pred) mean((data.test$vote_average-pred)^2))
plot(err, type="l")
plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
best=which.min(err)
abline(v=best, lty=2, col=4)
min(err)
[1] 0.4568311
Boosting - combination of previous models
boost.movies=gbm(vote_average ~ .-vote_classes ,data=data.train,
distribution="gaussian",n.trees=5000, interaction.depth=4, shrinkage=0.01)
plot(boost.movies$train.error, type="l")
#
par(mai=mai.new)
summary(boost.movies, las=1, cBar=10)
par(mai=mai.old)
yhat.boost=predict(boost.movies ,newdata=data.test,n.trees=1:5000)
err = apply(yhat.boost, 2, function(pred) mean((data.test$vote_average-pred)^2))
plot(err, type="l")
plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
best=which.min(err)
abline(v=best, lty=2, col=4)
err.boost= min(err)
# Comparison of models in terms of residual deviance
dev.gbm<- (sum((yhat.boost[,best]-data.test$vote_average)^2))
# dev.gbm
# dev.gam
# dev.lm
boost.movies
gbm(formula = vote_average ~ . - vote_classes, distribution = "gaussian",
data = data.train, n.trees = 5000, interaction.depth = 4,
shrinkage = 0.01)
A gradient boosted model with gaussian loss function.
5000 iterations were performed.
There were 23 predictors of which 22 had non-zero influence.
# partial dependence plots
plot(boost.movies, i.var=1, n.trees = best)
plot(boost.movies, i.var=2, n.trees = best)
plot(boost.movies, i.var=5, n.trees = best)
plot(boost.movies, i.var=c(1,5), n.trees = best) #bivariate (library(viridis) may be necessary)
#
plot(boost.movies, i.var=3, n.trees = best) # categorical
plot(boost.movies, i.var=6, n.trees = best)
plot(boost.movies, i=23, n.trees = best)# categorical
plot(boost.movies, i=17, n.trees = best) #no effect